%/***************************************************************************************
%Firm-embedded productivity and cross-country income differences
%Alviarez, Cravino and Ramondo
%Journal of Political Economy (2022)

%Program: table_C1_contribution_variance.do
%Date: October 2022

%Description: Reproduces Table C.1: Contribution to Var sni_j(w)
%*****************************************************************************************/


%-----------------------------------------------------------
%K-means preparation (for isocode): 
%-----------------------------------------------------------
clear; 
cd '.../dataverse_files/data/analysis'
type='naics';
type2='sales';

if strcmp(type,'naics') &&  strcmp(type2,'sales') 
    T = readtable('LCS_prereg_naics_sales_s1_base_woparent_2016.csv'); %Imports the file from Stata
end

data_allsec = table2array(T); 
KMEANS=ones(1,3); 
a=data_allsec(:,1);
yy_min=min(a)
yy_max=max(a)
a=data_allsec(:,2);
sec_max=max(a)

for sec=1:sec_max
sec
%From Stata: order year sss guo iso sin_sales lhs2
ind1 = data_allsec(:,2) ==sec;
data = data_allsec(ind1,:);
data=[data(:,4) data(:,6)];

%Define the size for the grid 
nw = 20;
k = 6;
start1=1/(nw+1); 
end1=nw/(nw+1); 

data = sortrows(data,[1 2]); 
dep_var=data(:,2); 
firmid=data(:,1); 
firm_unique=unique(firmid(:,1)); 
N=size(firm_unique,1); 

y_grid = linspace(start1,end1,nw); 
y_qtile = quantile(dep_var,y_grid); 

for i=1:nw
i
ind1=data(:,2)<=y_qtile(:,i);
data_temp1 = data(ind1,:);
id=data_temp1(:,1);
id_unique=unique(data_temp1(:,1));
value=ones(size(data_temp1(:,1),1),1); 
data_temp1b = cell2mat(accumarray(id,value,[],@(x){sum(x)})); 
data_qtile{i}=[id_unique data_temp1b]; 
end

data_qtile_matrix=firm_unique; 
for i=1:nw
i
A=[firm_unique firm_unique]; 
B=data_qtile{i}; 
q = unique([A(:,1);B(:,1)]); % Unique sorted concatenate.
aInd = ismember(q,A(:,1)); % All dates in A
bInd = ismember(q,B(:,1)); % All dates in b
x = zeros(length(q),3);
x(:,1) = q;
x(aInd,2) = A(:,2);
x(not(aInd),2) = NaN;
x(bInd,3) = B(:,2);
x(not(bInd),3) = NaN;
data_qtile_matrix(:,i+1)=x(:,end); 

end

value=ones(size(data(:,1),1),1); 
N_f = cell2mat(accumarray(firmid,value,[],@(x){sum(x)})); 
data_qtile_matrix = [data_qtile_matrix(:,1) bsxfun(@rdivide, data_qtile_matrix(:,2:end), N_f)];
data_qtile_matrix(isnan(data_qtile_matrix))=0; 


%K-means implementation: 
%-----------------------------------------------------------
X=data_qtile_matrix(:,2:end); 
tic; 
idx = kmeans(X,k,'MaxIter',10000,'Display','final','Replicates',5000);
toc

XX=[data_qtile_matrix(:,1) idx X];
yy=[sec*ones(size(idx,1),1)  data_qtile_matrix(:,1) idx];
KMEANS=[KMEANS; yy];
end 
KMEANS=KMEANS(2:end,:);

K1=array2table(KMEANS);
KMEANS3=K1;

%Exporting the data to a csv file: 
set=KMEANS3;
if strcmp(type,'naics') &&  strcmp(type2,'sales') 
writetable(set,'kmeans_prereg_naics_sales_s1_base_fromMatlab.csv')
end









